# US Senate Press Events network: 
#   R code for generating tables and graphics for
#   "Scraping public co-occurrences for statistical network analysis 
#    of political elites"
# 
# 
# Created by Paasha Mahdavi
# This version: 9 July 2017
#
#-----------------------------------------------------------------------------#
#    Loading packages and data                                                #
#-----------------------------------------------------------------------------#

# Start from an empty workspace so the replication does not depend on objects
# left over from an earlier session.
rm(list = ls())

# Install required packages
#
# Only the package names (the row names of the installed.packages() matrix)
# are compared. Matching against the whole matrix would also search every
# other field (versions, library paths, dependency strings), so a package
# could be treated as installed merely because its name shows up in one of
# those cells.
pkg <- c("TeachingDemos", "statnet", "stargazer", "miscTools", "stringr",
         "texreg", "ggplot2", "gridExtra", "dplyr")
missing.pkg <- setdiff(pkg, rownames(installed.packages()))
if (length(missing.pkg) > 0) install.packages(missing.pkg)
rm(pkg, missing.pkg)

# Load required packages (same order as listed above)
library("TeachingDemos")
library("statnet")
library("stargazer")
library("miscTools")
library("stringr")
library("texreg")
library("ggplot2")
library("gridExtra")
library("dplyr")



#-----------------------------------------------------------------------------#
#    Setting up code for generating log file                                  #
#-----------------------------------------------------------------------------#

# Start logging to "senatepress-replication.log" with both commands and
# results switched on; the matching txtStop() call is at the end of the script.
txtStart(file = "senatepress-replication.log", commands = TRUE, results = TRUE)



#-----------------------------------------------------------------------------#
#    Loading and analyzing scraped data                                       #
#-----------------------------------------------------------------------------#

# Read one scraped (google API) file and return it as a table of name pairs.
#
# File arrangement: results are in even rows, names are in odd rows. For each
# odd/even pair of rows, the value in the second column of the even row is
# converted to a number and attached to the odd row. The even "result" rows
# are then dropped, the columns are renamed, and the extra \" characters are
# removed from both name columns.
#
# Arguments:
#   path  CSV file to read; it has no header row.
#
# Value:
#   A data frame with the columns name1, name2 and hits, one row per row of
#   the file that is not a result row.
read_scraped_pairs <- function(path) {
  raw <- read.csv(path, header = FALSE)

  # At least one name row and one result row are needed for the steps below.
  stopifnot(nrow(raw) >= 2)

  # Every even row of the file. This used to be hard-coded as seq(2,10506,2),
  # which only worked for files of exactly that length.
  result.rows <- seq(2, nrow(raw), by = 2)

  raw$Results <- NA
  raw$Results[result.rows - 1] <- as.numeric(as.character(raw$V2[result.rows]))

  # unique(V1)[2] is taken to be the marker that the result rows carry in
  # their first column (row 2 is the first result row); drop those rows.
  pairs <- raw[raw$V1 != unique(raw$V1)[2], ]

  # Re-naming and removing extra \" \"
  names(pairs) <- c("name1", "name2", "hits")
  pairs$name1 <- gsub('\"', '', pairs$name1)
  pairs$name2 <- gsub('\"', '', pairs$name2)

  pairs
}


# Reading in data after scraping #1 to #5 (google API)

senate       <- read_scraped_pairs("senatepress1.csv")
senate.rob.2 <- read_scraped_pairs("senatepress2.csv")
senate.rob.3 <- read_scraped_pairs("senatepress3.csv")
senate.rob.4 <- read_scraped_pairs("senatepress4.csv")
senate.rob.5 <- read_scraped_pairs("senatepress5.csv")



# Network data (for first dataset)

# Take the first 5151 rows with the two name columns selected in swapped
# order and then rename them, so that these pairs also appear in the reverse
# direction (name2 -> name1) once stacked under the original rows.
# NOTE(review): 5151 is hard-coded; presumably the number of pairs of distinct
# senators -- confirm it matches the row order of the scraped file.
senate.b <- senate[1:5151,c("name2","name1","hits")]

names(senate.b) <- c("name1","name2","hits")

senate2 <- data.frame(rbind(senate,senate.b))

# Sort by name2 first and then by name1.
# NOTE(review): the intent appears to be "ordered by name1, with name2 ordered
# within each name1", which relies on arrange() keeping tied rows in their
# existing order -- confirm.
senate2 <- senate2 %>% arrange(name2)

senate2 <- senate2 %>% arrange(name1)


# Making into network matrix (adjacency or sociomatrix)

# Long to wide: one row per name1 and one hits column per value of name2.
nwk.s.df <- reshape(senate2, v.names="hits", timevar="name2", 
                    direction="wide", idvar="name1")

# Keep every column except the first one and convert the rest to a matrix.
nwk.s <- as.matrix(nwk.s.df[1:nrow(nwk.s.df),2:ncol(nwk.s.df)])

# Undirected network built from the matrix (matrix.type="a"), with the cell
# values kept as an edge attribute named "hits".
nwk <- network(nwk.s, 
               directed=FALSE, 
               matrix.type="a", 
               ignore.eval=FALSE, 
               names.eval="hits")

# Edgelist of nwk including the "hits" attribute.
senate3 <- as.matrix(nwk, attrname="hits",matrix.type="edgelist")

# Second network object: start from nwk, remove the edges, then add edges back
# for the vertex pairs listed in senate3 with "hits" values from senate2.
# NOTE(review): the result of delete.edges() is not assigned, so this relies on
# the network package modifying nwk1 in place -- confirm against its docs.
# NOTE(review): the values come from senate2[,3] while the pairs come from
# senate3; verify that their lengths and ordering line up as intended.
nwk1 <- nwk
delete.edges(nwk1, seq_along(nwk1$mel))
nwk1[senate3[,1:2], names.eval="hits", add.edges=TRUE] <- senate2[,3]


# Edgelist from network 

senate.edge <- as.matrix(nwk1, attrname="hits",matrix.type="edgelist")


# Centrality scores

# One row per name1 value of the wide table, with eigenvector centrality,
# betweenness and degree computed on nwk1.
# NOTE(review): ignore.eval=FALSE is presumably meant to make these measures
# use the edge values -- confirm against the sna documentation.
cent <- data.frame(names = nwk.s.df$name1)
cent$evcent <- evcent(nwk1, ignore.eval=FALSE)
cent$between <- betweenness(nwk1, ignore.eval=FALSE)
cent$degrees <- degree(nwk1, ignore.eval=FALSE)



#-----------------------------------------------------------------------------#
#    Data from Desmarais et al                                                #
#-----------------------------------------------------------------------------#

# Loading Desmarais, Moscardelli, Schaffner, Kowal (dmsk) data
# (the .RData file is expected to provide the `netList` object used below)

load("netList.RData")


# Fixing row/column names
# The name table is sorted by its `order` column before its third column is
# used as row and column labels.
# NOTE(review): this assumes that sorting by `order` reproduces the row/column
# order of the adjacency matrix -- confirm against the data files.

senator.names <- read.csv(file = "Desmarais-names.csv", header = TRUE) %>%
  arrange(order)


# Matrix of 110th senate

senate.dmsk <- netList$`110`$amat

rownames(senate.dmsk) <- senator.names[, 3]
colnames(senate.dmsk) <- senator.names[, 3]


# Network data
# Directed network built from the matrix (matrix.type = "a"), with the cell
# values kept as an edge attribute named "hits".

nwk.dmsk <- network(senate.dmsk,
                    directed = TRUE,
                    matrix.type = "a",
                    ignore.eval = FALSE,
                    names.eval = "hits")


# Edgelist (including the "hits" attribute)

senate.edge.dmsk <- as.matrix(nwk.dmsk, attrname = "hits",
                              matrix.type = "edgelist")


# Centrality scores
# One row per senator label, with eigenvector centrality, betweenness and
# degree computed on nwk.dmsk.

cent.dmsk <- data.frame(names = senator.names[, 3])
cent.dmsk$evcent <- evcent(nwk.dmsk, ignore.eval = FALSE)
cent.dmsk$between <- betweenness(nwk.dmsk, ignore.eval = FALSE)
cent.dmsk$degrees <- degree(nwk.dmsk, ignore.eval = FALSE)



#-----------------------------------------------------------------------------#
#    Comparing network matrices                                               #
#-----------------------------------------------------------------------------#

# Helpers ---------------------------------------------------------------------

# Return a copy of a scraped pair table with its three columns named
# name1, name2 and gchits.
rename_hits <- function(pairs) {
  names(pairs) <- c("name1", "name2", "gchits")
  pairs
}

# Combine one scraped data set with the DMSK press-event edgelist.
#
# A pair can be stored as (a, b) in one source and as (b, a) in the other, so
# three full outer merges are performed: scraped with DMSK on the names as
# given, then DMSK again with its two name columns swapped, then the scraped
# data again with its two name columns swapped. The ".x"/".y" duplicates that
# merge() creates are collapsed by taking ".x" unless it is NA, and pairs that
# are still NA afterwards are set to 0 (i.e., no google hits or press events).
#
# Arguments:
#   scraped  data frame with columns name1, name2, gchits
#   dmsk     data frame with columns name1, name2, pressevents
#
# Value:
#   The merged data frame, including the intermediate gchits.x, gchits.y,
#   pressevents.x and pressevents.y columns plus the final gchits and
#   pressevents columns.
combine_with_dmsk <- function(scraped, dmsk) {
  as.given <- c("name1", "name2")
  swapped  <- c("name2", "name1")

  out <- merge(x = scraped, y = dmsk,
               by = as.given,
               all.x = TRUE, all.y = TRUE)

  out <- merge(x = out, y = dmsk,
               by.x = as.given, by.y = swapped,
               all.x = TRUE, all.y = TRUE)

  out <- merge(x = out, y = scraped,
               by.x = as.given, by.y = swapped,
               all.x = TRUE, all.y = TRUE)

  # Google (scraped) hits
  out$gchits <- ifelse(is.na(out$gchits.x), out$gchits.y, out$gchits.x)

  # DMSK press events
  out$pressevents <- ifelse(is.na(out$pressevents.x),
                            out$pressevents.y,
                            out$pressevents.x)

  # Replacing NA with 0
  out$gchits <- ifelse(is.na(out$gchits), 0, out$gchits)
  out$pressevents <- ifelse(is.na(out$pressevents), 0, out$pressevents)

  out
}

# Removing looped pairs (i.e., self-hits or "a-a" pairs)
drop_self_pairs <- function(pairs) {
  pairs[pairs$name1 != pairs$name2, ]
}


# Scraped data

scraped.sen.data   <- rename_hits(senate)
scraped.sen.data.2 <- rename_hits(senate.rob.2)
scraped.sen.data.3 <- rename_hits(senate.rob.3)
scraped.sen.data.4 <- rename_hits(senate.rob.4)
scraped.sen.data.5 <- rename_hits(senate.rob.5)


# DMSK data
# The edgelist stores vertices as numbers; its "vnames" attribute is turned
# into a lookup table keyed by row name and merged in twice, once for each
# endpoint column (V1, then V2).

dmsk.data <- as.data.frame(senate.edge.dmsk)
dmsk.names <- as.data.frame(attr(senate.edge.dmsk, "vnames"))
dmsk.names$order <- rownames(dmsk.names)

dmsk.data <- merge(dmsk.data, dmsk.names, by.x = "V1", by.y = "order")
dmsk.data <- merge(dmsk.data, dmsk.names, by.x = "V2", by.y = "order")

# Columns 4 and 5 are the two looked-up names, column 3 is the edge value.
dmsk.data <- dmsk.data[, c(4, 5, 3)]
names(dmsk.data) <- c("name1", "name2", "pressevents")


# Combined data (dataset 1) and the same process repeated for datasets 2-5

combined   <- combine_with_dmsk(scraped.sen.data,   dmsk.data)
combined.2 <- combine_with_dmsk(scraped.sen.data.2, dmsk.data)
combined.3 <- combine_with_dmsk(scraped.sen.data.3, dmsk.data)
combined.4 <- combine_with_dmsk(scraped.sen.data.4, dmsk.data)
combined.5 <- combine_with_dmsk(scraped.sen.data.5, dmsk.data)


# Analysis data sets without looped pairs

dat   <- drop_self_pairs(combined)
dat.2 <- drop_self_pairs(combined.2)
dat.3 <- drop_self_pairs(combined.3)
dat.4 <- drop_self_pairs(combined.4)
dat.5 <- drop_self_pairs(combined.5)



#-----------------------------------------------------------------------------#
#    Appendix Figure 3                                                        #
#-----------------------------------------------------------------------------#
#
# <------ NOTE: points are jittered for visual clarity ------>
#
# Scatter plot for the first scraped data set, restricted to pairs with fewer
# than 9 press events: press events on the x axis, log(gchits + 1) on the y
# axis, light gray points and a linear fit drawn with level = 0.99.
# NOTE(review): the jitter is applied inside aes(), so stat_smooth()
# presumably fits the line to the jittered values too -- confirm this is
# intended.
ggplot(data = dat[dat$pressevents < 9, ],
       mapping = aes(x = jitter(pressevents),
                     y = jitter(log(gchits + 1)))) +
  geom_point(colour = "lightgray", alpha = 0.5) +
  stat_smooth(method = "lm", level = 0.99) +
  scale_y_continuous(breaks = seq(2, 10, 2)) +
  coord_cartesian(ylim = c(1, 10)) +
  xlab("Co-occurrences at Senate press events") +
  ylab("Co-occurrences at social/political events (logged) \n ") +
  theme_bw()

# No plot is passed, so ggsave() writes the most recent ggplot.
ggsave(filename = "AppendixFigure3.pdf", units = "in", width = 8.5, height = 7)




#-----------------------------------------------------------------------------#
#    Appendix Table 1                                                         #
#-----------------------------------------------------------------------------#

# OLS regression
# The same log-log specification is fitted once per scraped data set
# (dat, dat.2, ..., dat.5).

ols1 <- lm(data = dat,   formula = log(gchits + 1) ~ log(pressevents + 1))
ols2 <- lm(data = dat.2, formula = log(gchits + 1) ~ log(pressevents + 1))
ols3 <- lm(data = dat.3, formula = log(gchits + 1) ~ log(pressevents + 1))
ols4 <- lm(data = dat.4, formula = log(gchits + 1) ~ log(pressevents + 1))
ols5 <- lm(data = dat.5, formula = log(gchits + 1) ~ log(pressevents + 1))


# LaTeX table
# The five models go side by side; the constant is omitted and the number of
# observations is the only summary statistic kept.

cat("### --- Appendix Table 1 --- ###")


stargazer(ols1, ols2, ols3, ols4, ols5,
          dep.var.labels = "Google hits, logged",
          covariate.labels = "Press events, logged",
          omit = "Constant",
          keep.stat = "n",
          report = "vc*st")



#-----------------------------------------------------------------------------#
#    Stop generating log file                                                 #
#-----------------------------------------------------------------------------#

# Close the log that the txtStart() call near the top of the script opened.
txtStop()
